﻿using AngleSharp.Parser.Html;
using CefSharp;
using CefSharp.WinForms;
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
using AngleSharp.Dom;
using AngleSharp.Dom.Html;

namespace PageSpider
{
    /// <summary>
    /// Main window of the page spider. It hosts two embedded Chromium browsers (a "main" one
    /// for list pages and a "sub" one for pages opened from links) and saves the elements
    /// matched by user-supplied CSS selectors as text files or downloaded images.
    /// </summary>
    public partial class FormMain : Form
    {
        /// <summary>Fixed wait after asking a frame to navigate, before its source is read.</summary>
        private const int PageLoadDelayMs = 2000;

        /// <summary>Wait between attempts when a frame still returns empty source.</summary>
        private const int SourceRetryDelayMs = 1000;

        /// <summary>Number of extra attempts made to read a frame's source.</summary>
        private const int SourceRetryCount = 3;

        /// <summary>Timeout applied to each image download request.</summary>
        private const int ImageRequestTimeoutMs = 3000;

        private ChromiumWebBrowser _mainWebBrowser;
        private ChromiumWebBrowser _subWebBrowser;

        // Written by the UI thread (start/stop buttons) and read by the crawl worker,
        // hence volatile so the worker sees a stop request promptly.
        private volatile bool _isRunning = false;

        /// <summary>
        /// Snapshot of the crawl configuration, captured once on the UI thread so the worker
        /// never reads live controls and edits made during a run cannot change the crawl.
        /// </summary>
        private sealed class CrawlSettings
        {
            public string StartUrl { get; set; }
            public string Folder { get; set; }
            public string Prefix { get; set; }
            public string MainSelector { get; set; }
            public string MainNextSelector { get; set; }
            public string TitleSelector { get; set; }
            public string SubSelector { get; set; }
            public string SubNextSelector { get; set; }
            public string SubTitleSelector { get; set; }
            public int PageCount { get; set; }
        }

        public FormMain()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Creates the two embedded browsers and places one on each tab page.
        /// </summary>
        private void FormMain_Load(object sender, EventArgs e)
        {
            _mainWebBrowser = new ChromiumWebBrowser("about:blank");
            _mainWebBrowser.Dock = DockStyle.Fill;
            this.tp_main.Controls.Add(_mainWebBrowser);

            _subWebBrowser = new ChromiumWebBrowser("about:blank");
            _subWebBrowser.Dock = DockStyle.Fill;
            // NOTE(review): the second tab is selected while the sub browser is added and the
            // first tab is re-selected afterwards — presumably so the hidden tab page gets
            // created and the sub browser can initialise. Confirm before removing.
            this.tabControl1.SelectedIndex = 1;
            this.tp_sub.Controls.Add(_subWebBrowser);
            this.tabControl1.SelectedIndex = 0;
        }

        /// <summary>
        /// Loads the URL typed into the address box in the main browser.
        /// </summary>
        private void btn_go_Click(object sender, EventArgs e)
        {
            var url = this.tb_url.Text.Trim();
            if (url.Length == 0)
            {
                WriteLog("错误", "请输入网址!");
                return;
            }
            if (!_mainWebBrowser.IsBrowserInitialized)
            {
                WriteLog("错误", "浏览器尚未初始化，请稍后再试!");
                return;
            }
            _mainWebBrowser.GetBrowser().MainFrame.LoadUrl(url);
        }

        /// <summary>
        /// Lets the user pick the output folder.
        /// </summary>
        private void btn_folder_Click(object sender, EventArgs e)
        {
            using (var dialog = new FolderBrowserDialog())
            {
                dialog.ShowNewFolderButton = true;
                dialog.Description = "选择保存的文件夹";
                if (dialog.ShowDialog() == DialogResult.OK)
                {
                    this.tb_folder.Text = dialog.SelectedPath;
                }
            }
        }

        /// <summary>
        /// CSS selector for the elements to capture on the main page.
        /// </summary>
        private string MainSelector
        {
            get
            {
                return this.tb_sel.Text;
            }
        }

        /// <summary>
        /// CSS selector for the main page's "next page" link.
        /// </summary>
        private string MainNextSelector
        {
            get
            {
                return this.tb_next_sel.Text;
            }
        }

        /// <summary>
        /// CSS selector for the title element(s) on the main page.
        /// </summary>
        private string TitleSelector
        {
            get
            {
                return this.tb_title_sel.Text;
            }
        }

        /// <summary>
        /// CSS selector for the elements to capture on a sub page.
        /// </summary>
        private string SubSelector
        {
            get
            {
                return this.tb_sub_sel.Text;
            }
        }

        /// <summary>
        /// CSS selector for a sub page's "next page" link.
        /// </summary>
        private string SubNextSelector
        {
            get
            {
                return this.tb_sub_next_sel.Text;
            }
        }

        /// <summary>
        /// CSS selector for the title element(s) on a sub page.
        /// </summary>
        private string SubTitleSelector
        {
            get
            {
                return this.tb_sub_title_sel.Text;
            }
        }

        /// <summary>
        /// Number of main pages to crawl; falls back to 1 when the text box does not hold
        /// a valid integer.
        /// </summary>
        private int PageCount
        {
            get
            {
                int value;
                return int.TryParse(this.tb_page_count.Text, out value) ? value : 1;
            }
        }

        /// <summary>
        /// Reads every crawl option from the form. Must be called on the UI thread.
        /// </summary>
        private CrawlSettings CaptureSettings()
        {
            return new CrawlSettings
            {
                StartUrl = this.tb_url.Text.Trim(),
                Folder = this.tb_folder.Text,
                Prefix = this.tb_prefix.Text,
                MainSelector = MainSelector,
                MainNextSelector = MainNextSelector,
                TitleSelector = TitleSelector,
                SubSelector = SubSelector,
                SubNextSelector = SubNextSelector,
                SubTitleSelector = SubTitleSelector,
                PageCount = PageCount
            };
        }

        /// <summary>
        /// Validates the options and starts the crawl on a background task.
        /// </summary>
        private void btn_start_Click(object sender, EventArgs e)
        {
            if (_isRunning)
            {
                // A second click used to start a second, concurrent crawl on the same browsers.
                WriteLog("警告", "抓取任务正在运行中!");
                return;
            }

            var settings = CaptureSettings();
            if (string.IsNullOrWhiteSpace(settings.MainSelector) || string.IsNullOrWhiteSpace(settings.TitleSelector))
            {
                WriteLog("错误", "主选择器和主标题选择器不能为空!");
                return;
            }
            if (settings.PageCount > 1 && string.IsNullOrWhiteSpace(settings.MainNextSelector))
            {
                WriteLog("错误", "选择多页时需要下一页按钮选择器!");
                return;
            }
            if (!Directory.Exists(settings.Folder))
            {
                WriteLog("错误", "没有选定保存文件夹!");
                return;
            }
            if (!_mainWebBrowser.IsBrowserInitialized || !_subWebBrowser.IsBrowserInitialized)
            {
                WriteLog("错误", "浏览器尚未初始化，请稍后再试!");
                return;
            }

            _isRunning = true;
            WriteLog("信息", "开始抓取，共需抓取" + settings.PageCount + "页");
            var mainFrame = _mainWebBrowser.GetBrowser().MainFrame;
            var subFrame = _subWebBrowser.GetBrowser().MainFrame;
            Task.Run(() => RunCrawl(settings, mainFrame, subFrame));
        }

        /// <summary>
        /// Worker body: processes the page currently shown in the main browser, then follows
        /// the "next page" link until the requested number of pages is reached, the link can
        /// no longer be found, or a stop is requested.
        /// </summary>
        /// <param name="settings">Options captured when the crawl was started.</param>
        /// <param name="mainFrame">Main frame of the main browser.</param>
        /// <param name="subFrame">Main frame of the sub browser.</param>
        private void RunCrawl(CrawlSettings settings, IFrame mainFrame, IFrame subFrame)
        {
            try
            {
                var parser = new HtmlParser();
                // URL of the page currently being processed; relative links resolve against it.
                string pageUrl = settings.StartUrl;

                for (int i = 0; _isRunning && i < settings.PageCount; i++)
                {
                    WriteLog("信息", "开始抓取第" + (i + 1) + "轮");
                    string html = GetPageSource(mainFrame);
                    if (string.IsNullOrWhiteSpace(html))
                    {
                        WriteLog("错误", "无法获取页面源码，抓取中止!");
                        break;
                    }

                    var document = parser.Parse(html);
                    if (string.IsNullOrWhiteSpace(settings.SubSelector))
                    {
                        // No sub-page selector: save what the main selector matched on this page.
                        SaveMainPage(settings, pageUrl, document);
                    }
                    else
                    {
                        // Sub-page selector present: every matched element must be a link that
                        // is opened in the sub browser.
                        CrawlSubPages(settings, pageUrl, parser, document, subFrame);
                    }
                    WriteLog("信息", "第" + (i + 1) + "轮抓取完成");

                    if (!_isRunning || i >= settings.PageCount - 1)
                    {
                        break;
                    }

                    string nextUrl = FindNextPageUrl(document, settings.MainNextSelector, pageUrl);
                    if (nextUrl == null)
                    {
                        WriteLog("警告", "找不到可用的下一页链接，提前结束抓取!");
                        break;
                    }
                    mainFrame.LoadUrl(nextUrl);
                    pageUrl = nextUrl;
                    Thread.Sleep(PageLoadDelayMs);
                }
                WriteLog("信息", "抓取完成");
            }
            catch (Exception ex)
            {
                // Without this, an exception inside the task went unobserved and the run
                // simply stopped with no message.
                WriteLog("错误", "抓取过程中发生异常：" + ex.GetBaseException().Message);
            }
            finally
            {
                _isRunning = false;
            }
        }

        /// <summary>
        /// Reads the HTML source of a frame, retrying a few times while it is still empty.
        /// </summary>
        /// <returns>The source, which may still be null or blank after all retries.</returns>
        private string GetPageSource(IFrame frame)
        {
            // Blocking on .Result is acceptable here: this only runs on the crawl worker.
            string html = frame.GetSourceAsync().Result;
            int retries = SourceRetryCount;
            while (string.IsNullOrWhiteSpace(html) && retries != 0)
            {
                Thread.Sleep(SourceRetryDelayMs);
                html = frame.GetSourceAsync().Result;
                retries--;
            }
            return html;
        }

        /// <summary>
        /// Navigates a frame to <paramref name="url"/>, waits and parses the resulting source.
        /// </summary>
        /// <returns>The parsed document, or null when no source could be read.</returns>
        private IHtmlDocument LoadAndParse(HtmlParser parser, IFrame frame, string url)
        {
            frame.LoadUrl(url);
            // NOTE(review): a fixed delay is used instead of a load-finished signal, so a page
            // that loads slower than this may be read too early.
            Thread.Sleep(PageLoadDelayMs);
            string html = GetPageSource(frame);
            if (string.IsNullOrWhiteSpace(html))
            {
                return null;
            }
            return parser.Parse(html);
        }

        /// <summary>
        /// Returns the absolute URL of the last element matching <paramref name="nextSelector"/>,
        /// or null when there is no such element or it has no usable href.
        /// </summary>
        private static string FindNextPageUrl(IHtmlDocument document, string nextSelector, string pageUrl)
        {
            if (string.IsNullOrWhiteSpace(nextSelector))
            {
                return null;
            }
            var next = document.QuerySelectorAll(nextSelector).LastOrDefault();
            if (next == null)
            {
                return null;
            }
            return ResolvePageUrl(pageUrl, next.GetAttribute("href"));
        }

        /// <summary>
        /// Saves the elements matched on a main page when no sub-page selector is configured.
        /// </summary>
        private void SaveMainPage(CrawlSettings settings, string pageUrl, IHtmlDocument document)
        {
            var items = document.QuerySelectorAll(settings.MainSelector).ToList();
            var titles = document.QuerySelectorAll(settings.TitleSelector).ToList();

            if (titles.Count == 1 && items.Count >= 1)
            {
                var title = SanitizeFileName(titles[0].TextContent);
                var dir = Path.Combine(settings.Folder, title);
                if (!CreateDirectory(dir))
                {
                    return;
                }
                if (items.Count == 1)
                {
                    // One title, one element: the element is saved under the title's name.
                    HandleElment(settings, pageUrl, items[0], dir, title);
                    return;
                }
                // One title, several elements: the title names the folder and the elements
                // are saved by index (each may be text or an image).
                for (int j = 0; j < items.Count && _isRunning; j++)
                {
                    HandleElment(settings, pageUrl, items[j], dir, j.ToString());
                }
                return;
            }

            // Otherwise titles and elements are paired one to one, so the counts must match.
            if (titles.Count != items.Count)
            {
                WriteLog("错误", "标题数量与要素数量不匹配!");
                return;
            }
            for (int j = 0; j < items.Count && _isRunning; j++)
            {
                var title = SanitizeFileName(titles[j].TextContent);
                var dir = Path.Combine(settings.Folder, title);
                if (CreateDirectory(dir))
                {
                    HandleElment(settings, pageUrl, items[j], dir, title);
                }
            }
        }

        /// <summary>
        /// Opens every link matched on a main page in the sub browser and saves the content
        /// of the opened page(s) into a folder named after the matching title.
        /// </summary>
        private void CrawlSubPages(CrawlSettings settings, string pageUrl, HtmlParser parser, IHtmlDocument document, IFrame subFrame)
        {
            var items = document.QuerySelectorAll(settings.MainSelector).ToList();
            var titles = document.QuerySelectorAll(settings.TitleSelector).ToList();
            if (titles.Count != items.Count)
            {
                WriteLog("错误", "标题数量与要素数量不匹配!");
                return;
            }

            for (int j = 0; j < items.Count && _isRunning; j++)
            {
                var title = SanitizeFileName(titles[j].TextContent);
                WriteLog("信息", "开始抓取 " + title);

                var link = items[j];
                if (!IsTag(link, "a"))
                {
                    WriteLog("错误", "当前选中要素 " + title + " 不是链接!");
                    continue;
                }
                var targetUrl = ResolvePageUrl(pageUrl, link.GetAttribute("href"));
                if (targetUrl == null)
                {
                    WriteLog("错误", "无法解析 " + title + " 的链接地址!");
                    continue;
                }
                var dir = Path.Combine(settings.Folder, title);
                if (!CreateDirectory(dir))
                {
                    continue;
                }
                var subDocument = LoadAndParse(parser, subFrame, targetUrl);
                if (subDocument == null)
                {
                    WriteLog("错误", "无法获取 " + title + " 的页面源码!");
                    continue;
                }

                if (string.IsNullOrEmpty(settings.SubNextSelector))
                {
                    HandleSubPage(settings, targetUrl, subDocument, dir);
                }
                else
                {
                    CrawlPagedSubPage(settings, parser, subFrame, subDocument, targetUrl, dir);
                }
            }
        }

        /// <summary>
        /// Saves a sub page and keeps following its "next page" link. Paging stops when the
        /// link is missing, carries a "disabled" attribute, shows a different caption than on
        /// the first page, has no usable href, or points to a page already visited.
        /// </summary>
        private void CrawlPagedSubPage(CrawlSettings settings, HtmlParser parser, IFrame subFrame, IHtmlDocument subDocument, string subUrl, string dir)
        {
            // Caption of the "next" link on the first page; a different caption later on is
            // treated as "no further page".
            string nextCaption = null;
            // Guards against endless loops when the last page's link points back to a seen page.
            var visited = new HashSet<string>(StringComparer.Ordinal);
            visited.Add(subUrl);

            while (true)
            {
                HandleSubPage(settings, subUrl, subDocument, dir);
                if (!_isRunning)
                {
                    break;
                }

                var next = subDocument.QuerySelectorAll(settings.SubNextSelector).LastOrDefault();
                if (next == null || next.GetAttribute("disabled") != null)
                {
                    break;
                }
                if (nextCaption == null)
                {
                    nextCaption = next.TextContent;
                }
                else if (next.TextContent != nextCaption)
                {
                    break;
                }

                var nextUrl = ResolvePageUrl(subUrl, next.GetAttribute("href"));
                if (nextUrl == null || !visited.Add(nextUrl))
                {
                    break;
                }
                subDocument = LoadAndParse(parser, subFrame, nextUrl);
                if (subDocument == null)
                {
                    WriteLog("错误", "无法获取子页面源码：" + nextUrl);
                    break;
                }
                subUrl = nextUrl;
            }
        }

        /// <summary>
        /// Signature of <see cref="WriteLog"/>, used to marshal log calls to the UI thread.
        /// </summary>
        /// <param name="title">Severity label shown in brackets.</param>
        /// <param name="message">Text of the log entry.</param>
        private delegate void WriteTextDelegate(string title, string message);

        /// <summary>
        /// Appends a timestamped line to the log box; safe to call from any thread.
        /// </summary>
        /// <param name="title">Severity label shown in brackets.</param>
        /// <param name="message">Text of the log entry.</param>
        private void WriteLog(string title, string message)
        {
            if (this.tb_log.IsDisposed)
            {
                return;
            }
            if (this.tb_log.InvokeRequired)
            {
                try
                {
                    this.tb_log.Invoke(new WriteTextDelegate(WriteLog), title, message);
                }
                catch (InvalidOperationException)
                {
                    // The log box was destroyed while the worker was still running (the form
                    // is closing); there is nowhere left to show the message.
                }
                return;
            }
            this.tb_log.AppendText(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss") + "[" + title + "]：" + message + "\r\n");
        }

        /// <summary>
        /// Saves the elements matched by the sub-page selector of one sub page.
        /// </summary>
        /// <param name="settings">Options captured when the crawl was started.</param>
        /// <param name="pageUrl">URL of the sub page, used to resolve relative image addresses.</param>
        /// <param name="subDocument">Parsed sub page.</param>
        /// <param name="dir">Folder that belongs to the link this sub page was opened from.</param>
        private void HandleSubPage(CrawlSettings settings, string pageUrl, IHtmlDocument subDocument, string dir)
        {
            var items = subDocument.QuerySelectorAll(settings.SubSelector).ToList();

            if (string.IsNullOrWhiteSpace(settings.SubTitleSelector))
            {
                // No sub title selector: everything goes straight into the parent folder.
                for (int k = 0; k < items.Count; k++)
                {
                    HandleElment(settings, pageUrl, items[k], dir, k.ToString());
                }
                return;
            }

            var titles = subDocument.QuerySelectorAll(settings.SubTitleSelector).ToList();
            if (titles.Count == 1 && items.Count == 1)
            {
                // One title, one element: saved in the parent folder under the title's name.
                HandleElment(settings, pageUrl, items[0], dir, SanitizeFileName(titles[0].TextContent));
                return;
            }
            if (titles.Count == 1 && items.Count > 1)
            {
                // One title, several elements: the title names a sub folder and the elements
                // are saved by index.
                var subDir = Path.Combine(dir, SanitizeFileName(titles[0].TextContent));
                if (!CreateDirectory(subDir))
                {
                    return;
                }
                for (int k = 0; k < items.Count; k++)
                {
                    HandleElment(settings, pageUrl, items[k], subDir, k.ToString());
                }
                return;
            }

            // Otherwise titles and elements are paired one to one, so the counts must match.
            if (titles.Count != items.Count)
            {
                WriteLog("错误", "子标题数量与子要素数量不匹配!");
                return;
            }
            for (int k = 0; k < items.Count; k++)
            {
                var subDir = Path.Combine(dir, SanitizeFileName(titles[k].TextContent));
                if (CreateDirectory(subDir))
                {
                    HandleElment(settings, pageUrl, items[k], subDir, k.ToString());
                }
            }
        }

        /// <summary>
        /// Saves one element: an &lt;img&gt; is downloaded, anything else is stored as text.
        /// Failures are logged and do not interrupt the crawl.
        /// </summary>
        /// <param name="settings">Options captured when the crawl was started.</param>
        /// <param name="pageUrl">URL of the page the element came from.</param>
        /// <param name="element">Element to save.</param>
        /// <param name="path">Target folder.</param>
        /// <param name="title">Base name of the text file; also used in log messages.</param>
        private void HandleElment(CrawlSettings settings, string pageUrl, IElement element, string path, string title)
        {
            try
            {
                if (IsTag(element, "img"))
                {
                    // The src may be relative or protocol-relative, so resolve it first.
                    var imageUri = ResolveUrl(pageUrl, element.GetAttribute("src"));
                    if (imageUri == null || !IsHttpUri(imageUri))
                    {
                        WriteLog("错误", "处理" + title + "时发生错误：图片地址无效");
                        return;
                    }
                    var imageName = GetFileNameFromUri(imageUri);
                    if (imageName.Length == 0)
                    {
                        imageName = title;
                    }
                    SaveImageFromWeb(imageUri.AbsoluteUri, pageUrl, path, settings.Prefix + imageName);
                }
                else
                {
                    var content = NoHTML(element.InnerHtml);
                    SaveText(content, path, settings.Prefix + title + ".txt");
                }
            }
            catch (Exception ex)
            {
                WriteLog("错误", "处理" + title + "时发生错误：" + ex.Message);
            }
        }

        /// <summary>
        /// Downloads an image and writes it to <paramref name="path"/>. Responses whose
        /// content type is not "image/*" are skipped with a warning.
        /// </summary>
        /// <param name="imgUrl">Absolute http(s) address of the image.</param>
        /// <param name="referer">Value sent as the Referer header.</param>
        /// <param name="path">Target folder.</param>
        /// <param name="fileName">Target file name.</param>
        private void SaveImageFromWeb(string imgUrl, string referer, string path, string fileName)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(imgUrl);
            request.UserAgent = "Mozilla/6.0 (MSIE 6.0; Windows NT 5.1; Natas.Robot)";
            request.Referer = referer;
            request.Timeout = ImageRequestTimeoutMs;

            using (WebResponse response = request.GetResponse())
            using (Stream stream = response.GetResponseStream())
            {
                var contentType = response.ContentType ?? string.Empty;
                if (stream == null || !contentType.StartsWith("image/", StringComparison.OrdinalIgnoreCase))
                {
                    WriteLog("警告", imgUrl + " 不是图片，已跳过!");
                    return;
                }
                using (FileStream file = new FileStream(Path.Combine(path, fileName), FileMode.Create))
                {
                    // Copy until end of stream: ContentLength is -1 for chunked responses, and
                    // looping until a byte count is reached can spin forever on a short body.
                    stream.CopyTo(file);
                }
            }
        }

        /// <summary>
        /// Appends text to a file, creating the file when it does not exist yet.
        /// </summary>
        /// <param name="text">Text to append.</param>
        /// <param name="path">Target folder.</param>
        /// <param name="fileName">Target file name.</param>
        private void SaveText(string text, string path, string fileName)
        {
            File.AppendAllText(Path.Combine(path, fileName), text);
        }

        /// <summary>
        /// Strips scripts, tags and comments from an HTML fragment, decodes a handful of common
        /// entities, drops remaining numeric entities and removes angle brackets and CRLF pairs.
        /// </summary>
        /// <param name="Htmlstring">HTML fragment; null is treated as empty.</param>
        /// <returns>The remaining text, trimmed.</returns>
        private static string NoHTML(string Htmlstring)
        {
            if (string.IsNullOrEmpty(Htmlstring))
            {
                return string.Empty;
            }

            // Scripts first. Singleline lets '.' span line breaks, so the body of a multi-line
            // inline script is removed together with its tags.
            Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

            // Tags, line breaks followed by whitespace, and comment remnants.
            Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
            Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
            Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
            Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

            // Common entities.
            Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
            Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
            Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
            Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
            Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
            Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
            Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
            Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
            Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);
            // Any other numeric entity is dropped.
            Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);
            Htmlstring = Regex.Replace(Htmlstring, @"<img[^>]*>;", "", RegexOptions.IgnoreCase);

            // Strings are immutable: the results must be assigned back. They used to be
            // discarded, which made these three replacements no-ops.
            Htmlstring = Htmlstring.Replace("<", "");
            Htmlstring = Htmlstring.Replace(">", "");
            Htmlstring = Htmlstring.Replace("\r\n", "");
            return Htmlstring.Trim();
        }

        /// <summary>
        /// Creates a folder (a no-op when it already exists) and logs the reason on failure.
        /// </summary>
        /// <param name="dir">Folder to create.</param>
        /// <returns>True when the folder exists afterwards, false when it could not be created.</returns>
        private bool CreateDirectory(string dir)
        {
            try
            {
                Directory.CreateDirectory(dir);
                return true;
            }
            catch (Exception ex)
            {
                // Directory.CreateDirectory does not throw for an existing folder, so an
                // exception here is a real failure (invalid name, path too long, no access...).
                WriteLog("警告", "创建文件夹 " + dir + " 失败：" + ex.Message);
                return false;
            }
        }

        /// <summary>
        /// Requests the running crawl to stop; the worker checks the flag between steps.
        /// </summary>
        private void btn_stop_Click(object sender, EventArgs e)
        {
            _isRunning = false;
            WriteLog("信息", "停止信号已发出，请耐心等待任务停止!");
        }

        /// <summary>
        /// Culture-independent, case-insensitive tag name check.
        /// </summary>
        private static bool IsTag(IElement element, string tagName)
        {
            return string.Equals(element.TagName, tagName, StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>
        /// True for absolute http and https addresses.
        /// </summary>
        private static bool IsHttpUri(Uri uri)
        {
            return uri.Scheme == Uri.UriSchemeHttp || uri.Scheme == Uri.UriSchemeHttps;
        }

        /// <summary>
        /// Resolves <paramref name="url"/> against <paramref name="baseUrl"/>. When the base
        /// is not an absolute address, <paramref name="url"/> must be absolute on its own.
        /// </summary>
        /// <returns>The absolute address, or null when it cannot be determined.</returns>
        private static Uri ResolveUrl(string baseUrl, string url)
        {
            if (string.IsNullOrWhiteSpace(url))
            {
                return null;
            }
            url = url.Trim();

            Uri baseUri;
            Uri resolved;
            if (Uri.TryCreate(baseUrl, UriKind.Absolute, out baseUri))
            {
                // Handles relative, root-relative, protocol-relative and absolute references.
                return Uri.TryCreate(baseUri, url, out resolved) ? resolved : null;
            }
            return Uri.TryCreate(url, UriKind.Absolute, out resolved) ? resolved : null;
        }

        /// <summary>
        /// Resolves a link target and accepts it only when it is an http, https or file
        /// address, so that e.g. "javascript:" links are never navigated to.
        /// </summary>
        /// <returns>The absolute address as a string, or null when the link is unusable.</returns>
        private static string ResolvePageUrl(string baseUrl, string url)
        {
            Uri resolved = ResolveUrl(baseUrl, url);
            if (resolved == null)
            {
                return null;
            }
            if (!IsHttpUri(resolved) && !resolved.IsFile)
            {
                return null;
            }
            return resolved.AbsoluteUri;
        }

        /// <summary>
        /// Derives a file name from the last path segment of an address; the query string
        /// and fragment are not part of it.
        /// </summary>
        /// <returns>A sanitised file name, possibly empty.</returns>
        private static string GetFileNameFromUri(Uri uri)
        {
            string path = uri.AbsolutePath;
            int slash = path.LastIndexOf('/');
            string name = slash >= 0 ? path.Substring(slash + 1) : path;
            return SanitizeFileName(Uri.UnescapeDataString(name));
        }

        /// <summary>
        /// Makes a text usable as a single file or folder name: trims it, replaces every
        /// character reported by <see cref="Path.GetInvalidFileNameChars"/> with '_' and
        /// removes trailing dots and spaces.
        /// </summary>
        /// <returns>The sanitised name; empty when the input is null or empty.</returns>
        private static string SanitizeFileName(string name)
        {
            if (string.IsNullOrEmpty(name))
            {
                return string.Empty;
            }
            char[] invalid = Path.GetInvalidFileNameChars();
            var builder = new StringBuilder(name.Length);
            foreach (char c in name.Trim())
            {
                builder.Append(Array.IndexOf(invalid, c) >= 0 ? '_' : c);
            }
            return builder.ToString().TrimEnd('.', ' ');
        }
    }
}
